---
title: "Exploratory Data Analysis"
author: "Pranav Patil"
format:
  html:
    embed-resources: true
---
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
# Read the prepared NO2 dataset from disk and preview its first rows.
data = pd.read_csv("data_final.csv")
preview_rows = data.head()
print(preview_rows)
   Unnamed: 0       WHO Region ISO3 WHO Country Name    City or Locality  \
0           0  European Region  ALB          Albania             Elbasan
1           1  European Region  ALB          Albania             Elbasan
2           2  European Region  AND          Andorra  Escaldes-Engordany
3           3  European Region  AND          Andorra  Escaldes-Engordany
4           4  European Region  AND          Andorra  Escaldes-Engordany

   Measurement Year  NO2 (μg/m3)  NO2 temporal coverage (%)  Population  \
0              2015        23.96                  97.853881   2880703.0
1              2016        26.26                  96.049636   2876101.0
2              2012        31.64                 100.000000     71013.0
3              2014        27.62                 100.000000     71621.0
4              2015        26.65                  94.554795     71746.0

   Average NO2
0        23.96
1        26.26
2        31.64
3        27.62
4        26.65
# Summary statistics (count, mean, std, quartiles, extremes) of the numeric columns.
summary_stats = data.describe()
print(summary_stats)
Unnamed: 0 Measurement Year NO2 (μg/m3) \
count 16364.000000 16364.000000 16364.000000
mean 8181.500000 2015.587570 20.118549
std 4724.024238 2.722397 11.370591
min 0.000000 2010.000000 0.000000
25% 4090.750000 2014.000000 11.857500
50% 8181.500000 2016.000000 18.505000
75% 12272.250000 2018.000000 26.710000
max 16363.000000 2021.000000 89.770000
NO2 temporal coverage (%) Population Average NO2
count 16364.000000 1.636400e+04 16364.000000
mean 96.802395 1.433897e+08 20.118549
std 2.648411 3.385217e+08 4.475790
min 90.011000 3.702900e+04 1.080000
25% 94.931507 1.958872e+07 16.986737
50% 97.350000 5.972908e+07 20.023105
75% 99.132420 6.715835e+07 23.060341
max 100.000000 1.383112e+09 59.820000
# Scatter of every NO2 measurement against its year, one color per country.
no2_column = "NO2 (μg/m3)"
fig = px.scatter(
    data,
    x="Measurement Year",
    y=no2_column,
    color="WHO Country Name",
    title="NO2 Levels Over the Years",
    labels={no2_column: "NO2 Levels (μg/m3)"},
)
fig.show()
# Mean of the "Average NO2" column at three grouping levels (country, WHO
# region, city), followed by one bar chart per level.
avg_no2_by_country = data.groupby("WHO Country Name", as_index=False)["Average NO2"].mean()
avg_no2_by_region = data.groupby("WHO Region", as_index=False)["Average NO2"].mean()
avg_no2_by_city = data.groupby("City or Locality", as_index=False)["Average NO2"].mean()

avg_no2_labels = {"Average NO2": "Average NO2 Levels (μg/m3)"}
for grouped_means, group_column, level_name in (
    (avg_no2_by_country, "WHO Country Name", "Country"),
    (avg_no2_by_region, "WHO Region", "Region"),
    (avg_no2_by_city, "City or Locality", "City"),
):
    # Bars are colored by the same column that defines the x axis.
    fig = px.bar(
        grouped_means,
        x=group_column,
        y="Average NO2",
        color=group_column,
        title=f"Average NO2 Levels by {level_name}",
        labels=avg_no2_labels,
    )
    fig.show()
# Heatmaps of NO2 by measurement year at three grouping levels: country, WHO
# region and city. pivot_table aggregates with the mean by default, so every
# cell is the mean NO2 of the rows sharing that (group, year) pair.
#
# px.imshow only reads the label keys "x", "y" and "color"; the previous keys
# ("index", "columns", "value") were silently ignored, so the intended axis
# and colorbar titles never appeared on the figures.
for row_field, row_label, title_level in (
    ("WHO Country Name", "Country", "Country"),
    ("WHO Region", "Region", "Region"),
    ("City or Locality", "City or Locality", "City"),
):
    no2_grid = data.pivot_table(
        values="NO2 (μg/m3)",
        index=row_field,
        columns="Measurement Year",
    )
    fig = px.imshow(
        no2_grid,
        labels={"x": "Year", "y": row_label, "color": "NO2 Levels (μg/m3)"},
        title=f"NO2 Levels by {title_level} and Year",
    )
    fig.show()
# Per-region subsets of the dataset. The variable names are kept because the
# regional scatter, violin and box plots further down reuse them.
eastern_med_data = data[data['WHO Region'] == 'Eastern Mediterranean Region']
eu_data = data[data['WHO Region'] == 'European Region']
reg_amer_data = data[data['WHO Region'] == 'Region of the Americas']
se_asia_data = data[data['WHO Region'] == 'South East Asia Region']
western_pac_data = data[data['WHO Region'] == 'Western Pacific Region']

# One city-by-year heatmap per region. pivot_table averages by default, so each
# cell is the mean NO2 of that city's rows for that year.
#
# px.imshow only reads the label keys "x", "y" and "color"; the previous keys
# ("index", "columns", "value") were silently ignored, so the axis and
# colorbar titles fell back to the raw column names.
for region_name, region_frame in (
    ('Eastern Mediterranean Region', eastern_med_data),
    ('European Region', eu_data),
    ('Region of the Americas', reg_amer_data),
    ('South East Asia Region', se_asia_data),
    ('Western Pacific Region', western_pac_data),
):
    city_year_grid = region_frame.pivot_table(
        values="NO2 (μg/m3)",
        index="City or Locality",
        columns="Measurement Year",
    )
    fig = px.imshow(
        city_year_grid,
        labels={"x": "Year", "y": "City", "color": "NO2 Levels (μg/m3)"},
        title=f"NO2 Levels by City and Year in {region_name}",
    )
    fig.show()
# One bubble scatter per WHO region: year on x, city on y, marker size from
# the "Population" column and marker color from the NO2 value.
bubble_labels = {"NO2 (μg/m3)": "NO2 Levels (μg/m3)", "Population": "Population"}
for region_frame, region_name in (
    (eastern_med_data, "Eastern Mediterranean Region"),
    (eu_data, "European Region"),
    (reg_amer_data, "Region of the Americas"),
    (se_asia_data, "South East Asia Region"),
    (western_pac_data, "Western Pacific Region"),
):
    fig = px.scatter(
        region_frame,
        x="Measurement Year",
        y="City or Locality",
        size="Population",
        color="NO2 (μg/m3)",
        hover_name="City or Locality",
        title="NO2 Levels by City and Year in " + region_name,
        labels=bubble_labels,
        size_max=30,
    )
    fig.show()
# Animated world map of "Average NO2" per country, one frame per year.
#
# The raw frame has one row per measurement, so passing it directly gives each
# animation frame many duplicate entries for the same ISO3 code, and the frames
# follow the order in which years first appear in the file rather than
# chronological order. Collapsing to one row per (year, country) fixes both;
# groupby sorts by its keys, so years come out ascending.
# NOTE(review): "Average NO2" looks like it is already a per-country-per-year
# mean, in which case taking the mean again leaves the values unchanged —
# confirm against the data preparation step.
country_year_no2 = data.groupby(
    ["Measurement Year", "ISO3", "WHO Country Name"],
    as_index=False,
    sort=True,
)["Average NO2"].mean()

fig = px.choropleth(
    country_year_no2,
    locations="ISO3",
    color="Average NO2",
    hover_name="WHO Country Name",
    animation_frame="Measurement Year",
    color_continuous_scale=px.colors.sequential.Plasma,
    projection="natural earth",
    title="NO2 Levels by Country and Year in Each WHO Region",
    labels={"Average NO2": "Average NO2 Levels (μg/m3)"},
)
fig.show()
# Distribution of NO2 per WHO region: violin with an embedded box plot and
# every individual measurement drawn as a point.
region_violin_options = dict(
    x="WHO Region",
    y="NO2 (μg/m3)",
    box=True,
    points="all",
    color="WHO Region",
    hover_name="WHO Country Name",
    title="Violin Plot of NO2 Levels by Region",
    labels={"NO2 (μg/m3)": "NO2 Levels (μg/m3)"},
)
fig = px.violin(data, **region_violin_options)
fig.show()
# One violin plot per WHO region showing the NO2 distribution for each year.
# Each violin embeds a box plot and draws every measurement as a point; no
# color grouping is applied, and the city name is available on hover.
yearly_violin_labels = {"NO2 (μg/m3)": "NO2 Levels (μg/m3)", "Measurement Year": "Year"}
for region_frame, region_name in (
    (eastern_med_data, "Eastern Mediterranean Region"),
    (eu_data, "European Region"),
    (reg_amer_data, "Region of the Americas"),
    (se_asia_data, "South East Asia Region"),
    (western_pac_data, "Western Pacific Region"),
):
    fig = px.violin(
        region_frame,
        x="Measurement Year",
        y="NO2 (μg/m3)",
        box=True,
        points="all",
        hover_name="City or Locality",
        title="Violin Plot of NO2 Levels by Year and City in " + region_name,
        labels=yearly_violin_labels,
    )
    fig.show()
# Box plot of NO2 per WHO region, with every measurement drawn as a point and
# the country name available on hover.
region_box_options = dict(
    x="WHO Region",
    y="NO2 (μg/m3)",
    points="all",
    hover_name="WHO Country Name",
    color="WHO Region",
    title="Box Plot of NO2 Levels by Region",
    labels={"NO2 (μg/m3)": "NO2 Levels (μg/m3)"},
)
fig = px.box(data, **region_box_options)
fig.show()
# One box plot per WHO region showing the NO2 distribution for each year.
# Every measurement is drawn as a point; no color grouping is applied, and the
# city name is available on hover.
yearly_box_labels = {"NO2 (μg/m3)": "NO2 Levels (μg/m3)", "Measurement Year": "Year"}
for region_frame, region_name in (
    (eastern_med_data, "Eastern Mediterranean Region"),
    (eu_data, "European Region"),
    (reg_amer_data, "Region of the Americas"),
    (se_asia_data, "South East Asia Region"),
    (western_pac_data, "Western Pacific Region"),
):
    fig = px.box(
        region_frame,
        x="Measurement Year",
        y="NO2 (μg/m3)",
        points="all",
        hover_name="City or Locality",
        title="Box Plot of NO2 Levels by Year and City in " + region_name,
        labels=yearly_box_labels,
    )
    fig.show()
# Sum the NO2 column within each WHO region and show the shares as a pie.
# NOTE(review): a sum of concentrations grows with the number of measurement
# rows in a region — confirm that this, rather than a mean, is the intended
# quantity.
no2_sum_per_region = data.groupby("WHO Region")["NO2 (μg/m3)"].sum()
region_totals = no2_sum_per_region.reset_index()
fig = px.pie(
    region_totals,
    names='WHO Region',
    values='NO2 (μg/m3)',
    title='Total NO2 Levels by Region',
    labels={'NO2 (μg/m3)': 'Total NO2 Levels (μg/m3)', 'WHO Region': 'Region'},
)
fig.show()
# Number of measurement rows for every (year, country) pair, stored in a
# "Frequency" column.
rows_per_year_country = data.groupby(['Measurement Year', 'WHO Country Name']).size()
count_df = rows_per_year_country.reset_index(name='Frequency')
print(count_df)
     Measurement Year WHO Country Name  Frequency
0                2010          Austria        103
1                2010          Belgium         41
2                2010         Bulgaria         16
3                2010          Croatia          4
4                2010           Cyprus          2
..                ...              ...        ...
339              2019   United Kingdom         98
340              2020           Kuwait          5
341              2020            Qatar          1
342              2020        Singapore          1
343              2021            Qatar          1

[344 rows x 3 columns]
# Four views of the per-year, per-country measurement counts in count_df.
country_year_labels = {'Frequency': 'Count', 'WHO Country Name': 'Country', 'Measurement Year': 'Year'}

# Bars per country, colored by year.
fig = px.bar(
    count_df,
    x='WHO Country Name',
    y='Frequency',
    color='Measurement Year',
    title='Frequency of Each Combination of Year and Country',
    labels=country_year_labels,
)
fig.show()

# One line per country across the years.
fig = px.line(
    count_df,
    x='Measurement Year',
    y='Frequency',
    color='WHO Country Name',
    title='Frequency of Measurements Over the Years by Country',
    labels=country_year_labels,
)
fig.show()

# Spread of the per-country counts within each year.
fig = px.box(
    count_df,
    x='Measurement Year',
    y='Frequency',
    title='Distribution of Frequency Across Different Years',
    labels={'Frequency': 'Count', 'Measurement Year': 'Year'},
)
fig.show()

# The same counts as individual points, colored by country.
fig = px.scatter(
    count_df,
    x='Measurement Year',
    y='Frequency',
    color='WHO Country Name',
    title='Scatter Plot of Frequency Over the Years by Country',
    labels=country_year_labels,
)
fig.show()
# Heatmap of measurement counts with countries as rows and years as columns.
# count_df has a single row per (year, country), so the default mean
# aggregation of pivot_table leaves each count unchanged.
#
# px.imshow only reads the label keys "x", "y" and "color"; the previous keys
# ("index", "columns", "value") were silently ignored, so the intended axis
# and colorbar titles never appeared.
count_grid = count_df.pivot_table(
    values='Frequency',
    index='WHO Country Name',
    columns='Measurement Year',
)
fig = px.imshow(
    count_grid,
    labels={'x': 'Year', 'y': 'Country', 'color': 'Frequency'},
    title='Heatmap of Distribution of Measurements by Year and Country',
)
fig.show()
# Animated world map of the measurement count per country, one frame per year.
# Countries are matched to map shapes by name.
# NOTE(review): name matching depends on the WHO spellings being recognised by
# plotly; the ISO3 column in `data` may be a more robust key — confirm.
count_map_options = dict(
    locations='WHO Country Name',
    locationmode='country names',
    color='Frequency',
    animation_frame='Measurement Year',
    title='Choropleth Map of Frequency of Measurements by Country for Each Year',
    color_continuous_scale=px.colors.sequential.Plasma,
    labels={'Frequency': 'Count', 'Measurement Year': 'Year'},
)
fig = px.choropleth(count_df, **count_map_options)
fig.show()
# Distribution of the per-country measurement counts within each year: violin
# with an embedded box plot and all individual points shown.
count_violin_options = dict(
    x='Measurement Year',
    y='Frequency',
    box=True,
    points="all",
    title='Violin Plot of Frequency Measurements by Year',
    labels={'Frequency': 'Count', 'Measurement Year': 'Year'},
)
fig = px.violin(count_df, **count_violin_options)
fig.show()
# Pie chart of measurement counts with one sector label per country name.
count_pie_options = dict(
    names='WHO Country Name',
    values='Frequency',
    title='Distribution of Frequencies Across Countries',
    labels={'Frequency': 'Count', 'WHO Country Name': 'Country'},
)
fig = px.pie(count_df, **count_pie_options)
fig.show()
# Number of measurement rows for every (year, country, region) combination,
# stored in a "Frequency" column; the trailing expression previews the result.
region_group_keys = ['Measurement Year', 'WHO Country Name', 'WHO Region']
region_df = data.groupby(region_group_keys).size().reset_index(name='Frequency')
region_df.head()
| | Measurement Year | WHO Country Name | WHO Region | Frequency |
|---|---|---|---|---|
| 0 | 2010 | Austria | European Region | 103 |
| 1 | 2010 | Belgium | European Region | 41 |
| 2 | 2010 | Bulgaria | European Region | 16 |
| 3 | 2010 | Croatia | European Region | 4 |
| 4 | 2010 | Cyprus | European Region | 2 |
# Distribution of the per-year, per-country measurement counts within each WHO
# region, drawn first as a violin (with embedded box) and then as a box plot.
# Both charts show every individual point.
region_count_axes = dict(
    x='WHO Region',
    y='Frequency',
    points="all",
    labels={'Frequency': 'Count', 'WHO Region': 'Region'},
)

fig = px.violin(
    region_df,
    box=True,
    title='Violin Plot of Frequency Distribution by Region',
    **region_count_axes,
)
fig.show()

fig = px.box(
    region_df,
    title='Box Plot of Frequency Distribution by Region',
    **region_count_axes,
)
fig.show()
# Total number of measurement rows per WHO region, shown as colored bars.
freq_sum_per_region = region_df.groupby('WHO Region')['Frequency'].sum()
total_freq_by_region = freq_sum_per_region.reset_index()
fig = px.bar(
    total_freq_by_region,
    x='WHO Region',
    y='Frequency',
    color='WHO Region',
    title='Total Frequency by Region',
    labels={'Frequency': 'Total Count', 'WHO Region': 'Region'},
)
fig.show()
# Bubble chart of the measurement counts per region. The count drives both the
# y position and the bubble size; the country name is available on hover.
bubble_options = dict(
    x='WHO Region',
    y='Frequency',
    size='Frequency',
    color='WHO Region',
    hover_name='WHO Country Name',
    title='Bubble Chart of Frequency Distribution by Region',
    labels={'Frequency': 'Count', 'WHO Region': 'Region'},
)
fig = px.scatter(region_df, **bubble_options)
fig.show()
# Share of all measurement rows contributed by each WHO region.
freq_sum_per_region = region_df.groupby('WHO Region')['Frequency'].sum()
total_freq_by_region = freq_sum_per_region.reset_index()
fig = px.pie(
    total_freq_by_region,
    names='WHO Region',
    values='Frequency',
    title='Pie Chart of Frequency Distribution by Region',
    labels={'Frequency': 'Total Count', 'WHO Region': 'Region'},
)
fig.show()
# Heatmap of measurement counts with WHO regions as rows and years as columns.
# region_df holds one row per (year, country, region) and pivot_table averages
# by default, so each cell is the mean per-country count for that region and
# year, not the regional total.
#
# px.imshow only reads the label keys "x", "y" and "color"; the previous keys
# ("index", "columns", "value") were silently ignored, so the intended axis
# and colorbar titles never appeared.
region_year_grid = region_df.pivot_table(
    values='Frequency',
    index='WHO Region',
    columns='Measurement Year',
)
fig = px.imshow(
    region_year_grid,
    labels={'x': 'Year', 'y': 'Region', 'color': 'Frequency'},
    title='Heatmap of Frequency Distribution by Region and Year',
)
fig.show()
# Pairwise scatter plots across the five numeric columns of interest.
matrix_columns = [
    "Measurement Year",
    "NO2 (μg/m3)",
    "NO2 temporal coverage (%)",
    "Population",
    "Average NO2",
]
fig = px.scatter_matrix(data, dimensions=matrix_columns)
fig.show()
/Users/pranavpatil/anaconda3/lib/python3.11/site-packages/plotly/express/_core.py:279: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.
# Pair plot of the NO2-related columns, colored by WHO region with a marker
# symbol per country. The diagonal panels are hidden after the figure is built.
pair_plot_columns = ["NO2 (μg/m3)", "NO2 temporal coverage (%)", "Population", "Average NO2"]
pair_plot_options = dict(
    dimensions=pair_plot_columns,
    color="WHO Region",
    symbol="WHO Country Name",
    labels={"NO2 (μg/m3)": "NO2 Concentration (μg/m3)"},
    title="Pair Plot of NO2 Data",
    height=800,
    width=800,
)
fig = px.scatter_matrix(data, **pair_plot_options)
fig.update_traces(diagonal_visible=False)
fig.show()
/Users/pranavpatil/anaconda3/lib/python3.11/site-packages/plotly/express/_core.py:279: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.